This notebook is the PokemonGo project for the course ‘Customer Analytics’
Import the libraries needed
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2019 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(ggplot2)
library(tidyr)
library(caTools)
Import and read the data
# setwd("~/Desktop/MSc Data Analytics & AI/Semester 2 /Customer Analytics/Pokemon Go case")
# NOTE(review): a hard-coded setwd() ties the script to one machine; prefer an
# RStudio project or relative paths so collaborators can run it unchanged.
setwd("C:/Users/Charles-Louis/Desktop/MSc/2 - Customer Analytics/PokemonGo")
# 1 - Customerdata: contains player-related information
customer = read.csv("customerdata.csv" )
# 2 - Summerfintrx & Fallfintrx: contain transaction-related information
summer_trx = read.csv("summerfintrx.csv" )
fall_trx = read.csv("fallfintrx.csv" )
# 3 - SummersessTRX and FallsessTRX: contain session-related information
# (fall is read before summer here; the order has no effect)
fall_sess = read.csv("fallsesstrx.csv" )
summer_sess = read.csv("summersesstrx.csv")
Convert Date fields into Date format
# Parse the character date columns into Date objects (ISO "%Y-%m-%d" layout).
customer$Registrationdate <- as.Date(customer$Registrationdate, format = "%Y-%m-%d")
summer_sess$Date <- as.Date(summer_sess$Date, format = "%Y-%m-%d")
fall_sess$Date <- as.Date(fall_sess$Date, format = "%Y-%m-%d")
summer_trx$Date <- as.Date(summer_trx$Date, format = "%Y-%m-%d")
fall_trx$Date <- as.Date(fall_trx$Date, format = "%Y-%m-%d")
Quick analysis of the customer table with a summary
summary(customer)
## X CustomerID CustomerType Registrationdate
## Min. : 1 Min. : 1 Min. :1.000 Min. :2016-07-01
## 1st Qu.:1251 1st Qu.:1251 1st Qu.:1.000 1st Qu.:2016-12-08
## Median :2500 Median :2500 Median :2.000 Median :2018-01-01
## Mean :2500 Mean :2500 Mean :2.497 Mean :2017-08-13
## 3rd Qu.:3750 3rd Qu.:3750 3rd Qu.:4.000 3rd Qu.:2018-03-02
## Max. :5000 Max. :5000 Max. :4.000 Max. :2018-04-30
## Sex Age fallbonus Income
## Min. :0.0000 Min. : 6.00 Min. :0.0 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:18.00 1st Qu.:0.0 1st Qu.:1.000
## Median :0.0000 Median :27.00 Median :0.0 Median :2.000
## Mean :0.4038 Mean :27.51 Mean :0.2 Mean :2.011
## 3rd Qu.:1.0000 3rd Qu.:33.00 3rd Qu.:0.0 3rd Qu.:3.000
## Max. :1.0000 Max. :80.00 Max. :1.0 Max. :3.000
# We need to create a basetable containing only customers active during the summer (this is why the summer and fall transaction/session tables were not merged)
# Basetable restricted to customers active in summer: every CustomerID with at
# least one summer session, enriched with the customer profile (X row index
# dropped), sorted by CustomerID.
basetable_summer <- summer_sess %>%
  distinct(CustomerID) %>%
  left_join(customer, by = "CustomerID") %>%
  select(-X) %>%
  arrange(CustomerID)
basetable_summer
We assign monetary values to each ProductID in transaction dataframes
# Monetary value of each ProductID (position k = price of product k).
# Defined once so the summer and fall mappings cannot drift apart.
product_price <- c(2.99, 4.99, 9.99, 25, 99)
summer_trx <- summer_trx %>% mutate(Value = product_price[ProductID])
fall_trx <- fall_trx %>% mutate(Value = product_price[ProductID])
We define the end date for the period ( here we are studying summer)
# Last day of the summer observation window ("%Y-%m-%d" is as.Date's default
# format, so it need not be spelled out).
enddate_summer <- as.Date("2018-08-31")
We are going to evaluate the frequency and recency based on the sessions, then we will evaluate the monetary value based on the transaction dataset. We also calculate the Recency and Frequency of transactions as additional information.
# Session-based Recency (days since last summer session, relative to the
# period end date) and Frequency (number of summer sessions) per customer.
summer_sess_RF <- summer_sess %>% group_by(CustomerID) %>%
summarise(Recency = as.numeric(enddate_summer-max(Date)),
Frequency = n() ) %>%
ungroup()
# Transaction-based Recency/Frequency plus Monetary (total spend) per
# customer; only customers with at least one summer transaction appear here.
summer_trx_RFM <- summer_trx %>% group_by(CustomerID) %>%
summarise(Recency_Transac = as.numeric(enddate_summer-max(Date)),
Frequency_Transac = n(),
Monetary = sum(Value)) %>%
ungroup()
Here we get the RFM for each summer user (PAID or NOT)
# Left-join keeps every session customer; buyers pick up transaction RFM
# columns, non-buyers get NA there.
summer_RFM <- merge(summer_sess_RF, summer_trx_RFM, by = "CustomerID", all.x = TRUE)
# Non-buyers made no transactions: their spend and purchase count are zero.
summer_RFM$Monetary[is.na(summer_RFM$Monetary)] <- 0
summer_RFM$Frequency_Transac[is.na(summer_RFM$Frequency_Transac)] <- 0
summary(summer_RFM)
## CustomerID Recency Frequency Recency_Transac
## Min. : 1 Min. : 0.00 Min. : 1.000 Min. : 0.00
## 1st Qu.:1252 1st Qu.: 8.00 1st Qu.: 2.000 1st Qu.: 26.00
## Median :2506 Median : 21.00 Median : 4.000 Median : 54.00
## Mean :2502 Mean : 29.82 Mean : 4.243 Mean : 56.27
## 3rd Qu.:3751 3rd Qu.: 43.00 3rd Qu.: 6.000 3rd Qu.: 85.00
## Max. :5000 Max. :121.00 Max. :18.000 Max. :121.00
## NA's :3006
## Frequency_Transac Monetary
## Min. :0.000 Min. : 0.000
## 1st Qu.:0.000 1st Qu.: 0.000
## Median :0.000 Median : 0.000
## Mean :0.473 Mean : 4.179
## 3rd Qu.:1.000 3rd Qu.: 4.990
## Max. :5.000 Max. :200.990
##
# Customers who never bought have no transaction recency: mark them with a
# sentinel value of -1 (distinguishable from any real recency, which is >= 0).
summer_RFM$Recency_Transac[is.na(summer_RFM$Recency_Transac)] <- -1
summer_RFM
Here we join the RFM table with the customer information
# Attach the customer profile to the RFM table; the X row-index column of the
# customer file is dropped before joining.
basetable_summer <- summer_RFM %>%
  left_join(select(customer, -X), by = "CustomerID")
basetable_summer
Here we join the previous information with the customer session information and the number of transactions
# Per-customer totals of every session metric. Identifiers (X, PlayID, Date)
# are dropped first so only the numeric gameplay columns are summed.
# funs() is deprecated since dplyr 0.8 — list() is the supported replacement.
summer_sess_cust_stat <- summer_sess %>% group_by(CustomerID) %>% select(-c(X, PlayID, Date)) %>%
summarise_all(list(sum))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
##
## # Before:
## funs(name = f(.)
##
## # After:
## list(name = ~f(.))
## This warning is displayed once per session.
# Number of summer sessions per customer.
summer_sess_cust_nb <- summer_sess %>% group_by(CustomerID) %>% summarise(NbSess = n())
# One column per product ("NbTrxProd1".."NbTrxProd5") holding the number of
# purchases of that product per customer. paste0() replaces the deprecated-in-
# spirit paste(..., sep = "").
summer_trx_cust_stat <- summer_trx %>% group_by(CustomerID, ProductID) %>% summarise(NbTrx = n()) %>%
  mutate(ProductID = paste0("NbTrxProd", ProductID)) %>%
  spread(ProductID, NbTrx)
# Total number of summer transactions per customer.
summer_trx_cust_nb <- summer_trx %>% group_by(CustomerID) %>% summarise(NbTrx = n())
# Join the per-customer session stats, session counts, per-product transaction
# counts and transaction totals onto the basetable.
basetable_summer <- basetable_summer %>%
  left_join(summer_sess_cust_stat, by = 'CustomerID') %>%
  left_join(summer_sess_cust_nb, by = 'CustomerID') %>%
  left_join(summer_trx_cust_stat, by = 'CustomerID') %>%
  left_join(summer_trx_cust_nb, by = 'CustomerID')
# Customers with no transactions have NA in the joined columns: treat as zero.
basetable_summer[is.na(basetable_summer)] <- 0
basetable_summer
# Overall and per-CustomerType column means.
# funs() is deprecated since dplyr 0.8 — list() is the supported replacement.
basetable_summer %>% summarise_all(list(mean))
basetable_summer %>% group_by(CustomerType) %>% summarise_all(list(mean))
Quick calculation of the transaction churn rate
# Transaction churn rate: fraction of summer buyers with no fall purchase.
# semi_join keeps fall transactions of customers who also bought in summer —
# equivalent to filtering on CustomerID %in% unique(summer_trx$CustomerID).
cbind(summer_trx %>% summarise(Summer_Clients = n_distinct(CustomerID)),
      fall_trx %>%
        semi_join(summer_trx, by = "CustomerID") %>%
        summarise(Fall_NotChurners = n_distinct(CustomerID))
) %>%
  mutate(ChurnRate = 1 - Fall_NotChurners / Summer_Clients)
We create the CLV calculation formula
# Customer lifetime value over periods 0..t (note: starts in period 0):
#
#   CLV = -acquisition + sum_{i=0}^{t} margin * r^i / (1 + d)^i
#
# Args:
#   margin      : profit per period (scalar or vector; the function is
#                 vectorized over margin, returning one CLV per element)
#   r           : per-period retention probability
#   d           : per-period discount rate
#   acquisition : up-front acquisition cost
#   t           : number of periods after period 0
#
# Returns: numeric CLV, same length as margin.
calc_clv <- function(margin, r, d, acquisition, t) {
  periods <- 0:t
  # Closed vectorized form of the original period-by-period loop: each term is
  # margin * (r / (1 + d))^i, summed over i = 0..t.
  margin * sum((r / (1 + d))^periods) - acquisition
}
Assumptions: acquisition = 0 (no acquisition cost is available); margin = Monetary (the monetary value of each user); r = 0.657 (the value labelled "average churn" above — note that a retention probability is normally 1 − churn, so this is worth double-checking);
t (time) = 4, i.e. 4 seasons = 1 year (churn is quarterly); d (discount rate) = 0.1, the quarterly discount rate.
# CLV per user: margin = summer Monetary value, r = 0.657, d = 0.1, no
# acquisition cost, horizon t = 4 quarters. The original apply() passed three
# columns but only used the third (Monetary); calc_clv is vectorized over
# margin, so a direct column call is simpler and equivalent.
basetable_summer$clv <- calc_clv(basetable_summer$Monetary, 0.657, 0.1, 0, 4)
Now we have a new variable which is the CLV for each user.
# Show the new CLV next to Duration for context.
# NOTE(review): displaying Duration alongside the monetary CLV looks like a
# copy/paste from the time-based CLV below — Monetary may have been intended.
basetable_summer[,c("CustomerID","Duration","clv")]
NB : We can also create a CLV based on time spent on the game. The churn rate needs to be calculated on session tables and the margin is now the time spent
Sessions average churn
# Session churn rate: fraction of summer PLAYERS with no fall session.
# Bug fix: the filter previously used unique(summer_trx$CustomerID), which
# restricted the fall side to summer BUYERS and mismatched the denominator
# (all summer players). Both sides must use the session table.
cbind(summer_sess %>% summarise(Summer_Users = n_distinct(CustomerID)),
      fall_sess %>% filter(CustomerID %in% unique(summer_sess$CustomerID)) %>%
        summarise(Fall_NotChurners = n_distinct(CustomerID))
) %>% mutate(ChurnRate = 1 - Fall_NotChurners / Summer_Users)
# Time-based CLV: margin = total summer session Duration. As with clv, the
# row-wise apply() only used its third column, so call calc_clv directly on
# the Duration column (calc_clv is vectorized over margin).
basetable_summer$clv_time <- calc_clv(basetable_summer$Duration, 0.657, 0.1, 0, 4)
basetable_summer[,c("CustomerID","Duration","clv_time")]
# Bar chart of session Frequency, coloured by CustomerType (still numeric
# here, so the colour scale is continuous).
ggplot(basetable_summer, aes(x = Frequency, colour = CustomerType)) +
  geom_bar(alpha = 0.6) +
  scale_x_continuous(breaks = 1:30) +
  theme_bw() +
  ggtitle("Distribution by Frequency")
# Bar chart of Recency (days since last summer session).
ggplot(basetable_summer, aes(x = Recency, colour = CustomerType)) +
  geom_bar(alpha = 0.6) +
  theme_bw() +
  ggtitle("Distribution by Recency")
# CustomerType is a categorical code: convert it to a factor once so ggplot
# treats it as discrete. (The original repeated this conversion before every
# single plot — one conversion is sufficient and idempotent.)
basetable_summer$CustomerType <- as.factor(basetable_summer$CustomerType)
# Boxplots of each metric split by customer type.
ggplot(basetable_summer) + aes(x = CustomerType, y = Monetary, colour = CustomerType) +
  theme_bw() +
  geom_boxplot()
ggplot(basetable_summer) + aes(x = CustomerType, y = Age, colour = CustomerType) +
  theme_bw() +
  geom_boxplot()
ggplot(basetable_summer) + aes(x = CustomerType, y = Pokestops, colour = CustomerType) +
  theme_bw() +
  geom_boxplot()
ggplot(basetable_summer) + aes(x = CustomerType, y = Experience, colour = CustomerType) +
  theme_bw() +
  geom_boxplot()
ggplot(basetable_summer) + aes(x = CustomerType, y = Gyms, colour = CustomerType) +
  theme_bw() +
  geom_boxplot()
ggplot(basetable_summer) + aes(x = CustomerType, y = Raids, colour = CustomerType) +
  theme_bw() +
  geom_boxplot()
ggplot(basetable_summer) + aes(x = CustomerType, y = Social, colour = CustomerType) +
  theme_bw() +
  geom_boxplot()
ggplot(basetable_summer) + aes(x = CustomerType, y = Pokemons, colour = CustomerType) +
  theme_bw() +
  geom_boxplot()
ggplot(basetable_summer) + aes(x = CustomerType, y = Distance, colour = CustomerType) +
  theme_bw() +
  geom_boxplot()
ggplot(basetable_summer) + aes(x = CustomerType, y = clv, colour = CustomerType) +
  theme_bw() +
  geom_boxplot()
# Discretise Frequency and Recency into life-cycle-grid segments and recode
# Sex as a label. The original chained degenerate between(x, k, k) tests;
# Frequency is an integer count >= 1, so values 1..5 map straight to their
# own label and everything above becomes '>5'. case_when replaces the nested
# ifelse ladder for the recency bins (Recency >= 0, see summary above).
basetable_summer <- basetable_summer %>%
  mutate(segm.freq = ifelse(Frequency > 5, '>5', as.character(Frequency))) %>%
  mutate(segm.rec = case_when(
    Recency <= 5 ~ '0-5 days',
    Recency <= 10 ~ '6-10 days',
    Recency <= 20 ~ '11-20 days',
    Recency <= 30 ~ '21-30 days',
    Recency <= 50 ~ '31-50 days',
    TRUE ~ '>50 days'
  )) %>%
  mutate(Sex = ifelse(Sex == 1, "Women", "Men"))
# Splitting into discrete factor levels (least engaged first) so the facet
# grids later on read in a consistent order.
basetable_summer$segm.freq <- factor(basetable_summer$segm.freq, levels=c('>5', '5', '4', '3', '2', '1'))
basetable_summer$segm.rec <- factor(basetable_summer$segm.rec, levels=c('>50 days', '31-50 days', '21-30 days', '11-20 days', '6-10 days', '0-5 days'))
basetable_summer$Sex <- factor(basetable_summer$Sex, levels=c("Men","Women"))
summary(basetable_summer)
## CustomerID Recency Frequency Recency_Transac
## Min. : 1 Min. : 0.00 Min. : 1.000 Min. : -1.00
## 1st Qu.:1252 1st Qu.: 8.00 1st Qu.: 2.000 1st Qu.: -1.00
## Median :2506 Median : 21.00 Median : 4.000 Median : -1.00
## Mean :2502 Mean : 29.82 Mean : 4.243 Mean : 19.77
## 3rd Qu.:3751 3rd Qu.: 43.00 3rd Qu.: 6.000 3rd Qu.: 32.00
## Max. :5000 Max. :121.00 Max. :18.000 Max. :121.00
## Frequency_Transac Monetary CustomerType Registrationdate
## Min. :0.000 Min. : 0.000 1:1225 Min. :2016-07-01
## 1st Qu.:0.000 1st Qu.: 0.000 2:1144 1st Qu.:2016-12-07
## Median :0.000 Median : 0.000 3:1069 Median :2017-12-16
## Mean :0.473 Mean : 4.179 4:1279 Mean :2017-08-10
## 3rd Qu.:1.000 3rd Qu.: 4.990 3rd Qu.:2018-03-01
## Max. :5.000 Max. :200.990 Max. :2018-04-30
## Sex Age fallbonus Income
## Men :2815 Min. : 6.00 Min. :0.0000 Min. :1.000
## Women:1902 1st Qu.:18.00 1st Qu.:0.0000 1st Qu.:1.000
## Median :27.00 Median :0.0000 Median :2.000
## Mean :27.58 Mean :0.2006 Mean :2.014
## 3rd Qu.:33.00 3rd Qu.:0.0000 3rd Qu.:3.000
## Max. :80.00 Max. :1.0000 Max. :3.000
## Experience Pokestops Gyms Raids
## Min. : 1767 Min. : 9.00 Min. : 2.00 Min. : 0.000
## 1st Qu.: 9036 1st Qu.: 34.00 1st Qu.: 14.00 1st Qu.: 0.000
## Median : 15738 Median : 57.00 Median : 26.00 Median : 1.000
## Mean : 19110 Mean : 65.93 Mean : 31.48 Mean : 3.098
## 3rd Qu.: 25097 3rd Qu.: 91.00 3rd Qu.: 43.00 3rd Qu.: 4.000
## Max. :104901 Max. :263.00 Max. :150.00 Max. :53.000
## Social Pokemons Distance Duration
## Min. : 0.000 Min. : 4.00 Min. : 0.383 Min. : 10.77
## 1st Qu.: 1.000 1st Qu.: 36.00 1st Qu.: 5.084 1st Qu.: 86.75
## Median : 2.000 Median : 66.00 Median : 10.217 Median : 155.45
## Mean : 4.898 Mean : 78.87 Mean : 14.969 Mean : 278.67
## 3rd Qu.: 5.000 3rd Qu.:107.00 3rd Qu.: 19.762 3rd Qu.: 285.93
## Max. :60.000 Max. :465.00 Max. :117.339 Max. :2353.39
## NbSess NbTrxProd1 NbTrxProd2 NbTrxProd3
## Min. : 1.000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.: 2.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median : 4.000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean : 4.243 Mean :0.1749 Mean :0.1556 Mean :0.08204
## 3rd Qu.: 6.000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :18.000 Max. :3.0000 Max. :3.0000 Max. :2.00000
## NbTrxProd4 NbTrxProd5 NbTrx clv
## Min. :0.000 Min. :0.00000 Min. :0.000 Min. : 0.000
## 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:0.000 1st Qu.: 0.000
## Median :0.000 Median :0.00000 Median :0.000 Median : 0.000
## Mean :0.053 Mean :0.00742 Mean :0.473 Mean : 9.587
## 3rd Qu.:0.000 3rd Qu.:0.00000 3rd Qu.:1.000 3rd Qu.: 11.449
## Max. :2.000 Max. :2.00000 Max. :5.000 Max. :461.138
## clv_time segm.freq segm.rec
## Min. : 24.7 >5:1243 >50 days :937
## 1st Qu.: 199.0 5 : 496 31-50 days:820
## Median : 356.6 4 : 654 21-30 days:658
## Mean : 639.4 3 : 830 11-20 days:864
## 3rd Qu.: 656.0 2 : 831 6-10 days :555
## Max. :5399.5 1 : 663 0-5 days :883
# Transactions enriched with the buyer's customer type.
lcg.product <- summer_trx %>%
  left_join(customer, by = "CustomerID") %>%
  select(CustomerID, ProductID, Value, CustomerType)
Distribution of products by customer type (might be useful later)
# Revenue and per-type purchase counts for each product.
# sum(CustomerType == k) counts the TRUE values directly — the original
# sum(ifelse(cond, 1, 0)) round-trip is unnecessary.
lcg.product2 <- lcg.product %>%
  group_by(ProductID) %>%
  summarise(Monetary = sum(Value),
            Type1 = sum(CustomerType == 1),
            Type2 = sum(CustomerType == 2),
            Type3 = sum(CustomerType == 3),
            Type4 = sum(CustomerType == 4))
lcg.product2
Life cycle grid by quantity , frequency , recency
# Player count and revenue per (recency, frequency) cell of the grid; the
# constant "player" column gives each facet a single x position.
lcg <- basetable_summer %>%
  group_by(segm.rec, segm.freq) %>%
  summarise(quantity = n(), money = sum(Monetary)) %>%
  mutate(player = "player") %>%
  ungroup()
# Grid of player counts: one facet per (frequency, recency) segment.
ggplot(lcg, aes(x = player, y = quantity, fill = quantity)) +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = .5, vjust = .5, face = "plain")) +
  geom_bar(aes(y = quantity), stat = 'identity', alpha = 0.5) +
  geom_text(aes(y = max(quantity) / 2, label = round(quantity, 0)), size = 4) +
  facet_grid(segm.freq ~ segm.rec) +
  ggtitle("Life Cycle Grids - Number of player per Recency & Frequency")
Life cycle grid by monetary , frequency , recency
# Same grid as above, but showing revenue per segment instead of headcount.
ggplot(lcg, aes(x = player, y = money, fill = money)) +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = .5, vjust = .5, face = "plain")) +
  geom_bar(aes(y = money), stat = 'identity', alpha = 0.5) +
  geom_text(aes(y = max(money) / 2, label = round(money, 0)), size = 4) +
  facet_grid(segm.freq ~ segm.rec) +
  ggtitle("Life Cycle Grids - Money earned per Recency & Frequency")
Life cycle grid by gender, frequency , recency
# Same segmentation, now additionally split by gender.
lcg.gender <- basetable_summer %>%
  group_by(segm.rec, segm.freq, Sex) %>%
  summarise(quantity = n(), money = sum(Monetary)) %>%
  mutate(player = "player") %>%
  ungroup()
# Grid of player counts per gender within each (frequency, recency) segment.
ggplot(lcg.gender, aes(x = Sex, y = quantity, fill = Sex)) +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = .5, vjust = .5, face = "plain")) +
  geom_bar(aes(y = quantity), stat = 'identity', alpha = 0.5) +
  geom_text(aes(y = max(quantity) / 2, label = round(quantity, 0)), size = 4) +
  facet_grid(segm.freq ~ segm.rec) +
  ggtitle("Life Cycle Grids - Gender per Recency & Frequency")
Life cycle grid by gender, frequency , recency
# Revenue per gender within each (frequency, recency) segment.
ggplot(lcg.gender, aes(x = Sex, y = money, fill = Sex)) +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = .5, vjust = .5, face = "plain")) +
  geom_bar(aes(y = money), stat = 'identity', alpha = 0.5) +
  geom_text(aes(y = max(money) / 2, label = round(money, 0)), size = 4) +
  facet_grid(segm.freq ~ segm.rec) +
  ggtitle("Life Cycle Grids - Money earned per Gender, Recency & Frequency")
Life cycle grid by customer type and Quantity
# Counts and revenue per (customer type, recency, frequency) cell.
lcg.test <- basetable_summer %>%
  group_by(CustomerType, segm.rec, segm.freq) %>%
  summarise(quantity = n(),
            money = sum(Monetary)) %>%
  ungroup()
# Grid of player counts per customer type.
ggplot(lcg.test, aes(x = CustomerType, fill = CustomerType)) +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = .5, vjust = .5, face = "plain")) +
  geom_bar(aes(y = quantity), stat = 'identity', alpha = 0.5) +
  geom_text(aes(y = quantity, label = round(quantity, 0)), size = 3) +
  facet_grid(segm.freq ~ segm.rec) +
  ggtitle("LifeCycle Grids - Number of player per Customer type, Recency & Frequency ")
Life cycle grid by customer type and Monetary value
# Revenue per customer type within each (frequency, recency) segment.
ggplot(lcg.test, aes(x = CustomerType, fill = CustomerType)) +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = .5, vjust = .5, face = "plain")) +
  geom_bar(aes(y = money), stat = 'identity', alpha = 0.5) +
  geom_text(aes(y = money, label = round(money, 0)), size = 2) +
  facet_grid(segm.freq ~ segm.rec) +
  ggtitle("LifeCycle Grids - Money earned per Customer type, Recency & Frequency ")
Life cycle grid by customer type, gender and Quantity
# Counts and revenue per (customer type, recency, frequency, gender) cell.
lcg.test.gender <- basetable_summer %>%
  group_by(CustomerType, segm.rec, segm.freq, Sex) %>%
  summarise(quantity = n(),
            money = sum(Monetary)) %>%
  ungroup()
# Player counts per customer type, stacked by gender.
ggplot(lcg.test.gender, aes(x = CustomerType, y = quantity, fill = Sex)) +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = .5, vjust = .5, face = "plain")) +
  geom_bar(aes(y = quantity), stat = 'identity', alpha = 0.5) +
  #geom_text(aes(y=quantity, label=round(quantity,0)), size=2) +
  facet_grid(segm.freq ~ segm.rec) +
  ggtitle("Number of player per Customer type, Gender, Recency & Frequency")
Life cycle grid by customer type, gender and Monetary Value
# Revenue per customer type, stacked by gender.
# Fix: the title was copied from the non-gender plot above and omitted
# "Gender", even though this chart is additionally split by Sex.
ggplot(lcg.test.gender, aes(x = CustomerType, fill = Sex)) +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text.x = element_text(angle = 90, hjust = .5, vjust = .5, face = "plain")) +
  geom_bar(aes(y = money), stat = 'identity', alpha = 0.5) +
  #geom_text(aes(y=money, label=round(money,0)), size=2) +
  facet_grid(segm.freq ~ segm.rec) +
  ggtitle("LifeCycle Grids - Money earned per Customer type, Gender, Recency & Frequency")
# Seasonal activity status per customer:
# 0 = no sessions, 1 = sessions only, 2 = at least one transaction.
# A nested ifelse checks transactions first (they take precedence over mere
# sessions), then sessions, and defaults to 0 — so no NA pass is needed.
churn_status <- customer %>% select(CustomerID)
churn_status$summer_status <- ifelse(churn_status$CustomerID %in% summer_trx$CustomerID, 2,
                                     ifelse(churn_status$CustomerID %in% summer_sess$CustomerID, 1, 0))
churn_status$fall_status <- ifelse(churn_status$CustomerID %in% fall_trx$CustomerID, 2,
                                   ifelse(churn_status$CustomerID %in% fall_sess$CustomerID, 1, 0))
# Churn Encoding (NA = definition not applicable to that customer):
# churn_trx   : 1 if a summer buyer stopped buying in fall, 0 otherwise
# churn_sess  : 1 if a summer player had no fall sessions, 0 otherwise
# churn_total : 1 if a summer buyer had no fall sessions at all, 0 otherwise
churn_status$churn_trx <- ifelse(churn_status$summer_status == 2,
                                 as.numeric(churn_status$fall_status != 2), NA)
churn_status$churn_sess <- ifelse(churn_status$summer_status != 0,
                                  as.numeric(churn_status$fall_status == 0), NA)
churn_status$churn_total <- ifelse(churn_status$summer_status == 2,
                                   as.numeric(churn_status$fall_status == 0), NA)
# Drop customers who were inactive in both seasons.
churn_status <- churn_status %>% filter(!(summer_status == 0 & fall_status == 0))
churn_status
# 3x3 flow matrix: row = summer status (0/1/2), column = fall status (0/1/2),
# cell = number of customers moving from that summer state to that fall state.
churn_matrix <- matrix(nrow = 3, ncol = 3)
for(i in 1:3){
for(j in 1:3){
# Statuses are 0-based, matrix indices 1-based — hence i-1 / j-1.
value = churn_status %>%
filter(summer_status == i-1 & fall_status == j-1) %>%
summarise(value = n_distinct(CustomerID))
churn_matrix[i,j] <- value$value
}
}
churn_matrix <- data.frame(churn_matrix)
# Status 0 in summer means the customer only appeared in fall ("Inflow");
# status 0 in fall means they disappeared ("Outflow").
rownames(churn_matrix) <- c("Inflow","Summer_sessions","Summer_transactions")
colnames(churn_matrix) <- c("Outflow","Fall_sessions","Fall_transactions")
# Reorder so active states come first and the flow rows/columns come last.
churn_matrix <- churn_matrix[c("Summer_sessions","Summer_transactions","Inflow"),
c("Fall_sessions","Fall_transactions","Outflow")]
# Append marginal totals: first a total row (column sums), then a total
# column (row sums, which now include the new total row).
churn_matrix <- rbind(churn_matrix, data.frame(as.list(colSums(churn_matrix))))
churn_matrix <- cbind(churn_matrix,t(data.frame(as.list(rowSums(churn_matrix)))))
# Rename the appended 4th row/column to "Total".
colnames(churn_matrix) <- c(colnames(churn_matrix)[-4],"Total")
rownames(churn_matrix) <- c(rownames(churn_matrix)[-4],"Total")
churn_matrix
We are going to update the basetable with the churn information
# Attach the churn flags to the basetable and drop the helper columns that
# were only needed to derive them (plus the duplicated count columns).
basetable_churn <- basetable_summer %>%
  left_join(churn_status, by = 'CustomerID') %>%
  select(-c("NbTrx", "NbSess", "summer_status", "fall_status"))
# Dummy-encode CustomerType (Type4 is the implicit reference level) and drop
# the factor/segment columns that should not enter the regressions.
basetable_churn <- basetable_churn %>%
  mutate(Type1 = ifelse(CustomerType == 1, 1, 0),
         Type2 = ifelse(CustomerType == 2, 1, 0),
         Type3 = ifelse(CustomerType == 3, 1, 0)) %>%
  select(-c("CustomerType", "segm.freq", "segm.rec"))
# One modelling table per churn definition: keep only the rows where that
# definition applies, and drop the two other churn targets.
basetable_churn_trx <- basetable_churn[!is.na(basetable_churn$churn_trx), ] %>%
  select(-c("churn_sess", "churn_total"))
basetable_churn_sess <- basetable_churn[!is.na(basetable_churn$churn_sess), ] %>%
  select(-c("churn_trx", "churn_total"))
basetable_churn_total <- basetable_churn[!is.na(basetable_churn$churn_total), ] %>%
  select(-c("churn_trx", "churn_sess"))
basetable_churn_total
# Average transaction churn among summer buyers.
basetable_churn_trx %>% summarise(mean(churn_trx))
We have 66% churn in the database — that is huge!
Let’s split the dataset to perform our logistic regression
# 70/30 train/validation split for the transaction-churn logistic regression.
# Fixed seed makes the split reproducible.
set.seed(654)
split <- sample.split(basetable_churn_trx$CustomerID, SplitRatio = 0.70)
training <- basetable_churn_trx[split == TRUE, ]
validation <- basetable_churn_trx[split == FALSE, ]
# Full logistic model: churn_trx against every remaining column.
modeltraining <- glm(churn_trx ~ ., family = binomial(link = 'logit'), data = training)
summary(modeltraining)
##
## Call:
## glm(formula = churn_trx ~ ., family = binomial(link = "logit"),
## data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1489 -1.0835 0.6986 0.8569 1.6772
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.550e+00 6.089e+00 -0.911 0.3620
## CustomerID -4.051e-05 4.565e-05 -0.887 0.3748
## Recency 3.277e-03 2.891e-03 1.133 0.2571
## Frequency -1.692e-01 2.053e-01 -0.824 0.4099
## Recency_Transac -2.838e-03 1.941e-03 -1.463 0.1436
## Frequency_Transac 2.683e-01 3.068e-01 0.874 0.3819
## Monetary -1.653e-03 6.719e-03 -0.246 0.8057
## Registrationdate 4.013e-04 3.443e-04 1.165 0.2439
## SexWomen -9.028e-02 1.329e-01 -0.679 0.4969
## Age -1.619e-03 4.810e-03 -0.336 0.7365
## fallbonus -1.441e+00 1.564e-01 -9.214 <2e-16 ***
## Income -5.358e-02 7.938e-02 -0.675 0.4997
## Experience -2.178e-01 1.012e-01 -2.152 0.0314 *
## Pokestops 1.090e+01 5.062e+00 2.153 0.0313 *
## Gyms 4.356e+01 2.024e+01 2.152 0.0314 *
## Raids 2.178e+02 1.012e+02 2.152 0.0314 *
## Social 4.357e+01 2.024e+01 2.152 0.0314 *
## Pokemons 1.089e+01 5.061e+00 2.151 0.0315 *
## Distance 2.179e+01 1.012e+01 2.152 0.0314 *
## Duration -1.366e-04 6.576e-04 -0.208 0.8355
## NbTrxProd1 -1.012e-01 2.877e-01 -0.352 0.7251
## NbTrxProd2 -3.116e-01 2.789e-01 -1.117 0.2639
## NbTrxProd3 -2.688e-01 2.696e-01 -0.997 0.3188
## NbTrxProd4 NA NA NA NA
## NbTrxProd5 NA NA NA NA
## clv NA NA NA NA
## clv_time NA NA NA NA
## Type1 -1.027e-01 3.300e-01 -0.311 0.7556
## Type2 6.630e-02 3.024e-01 0.219 0.8265
## Type3 8.713e-01 3.864e-01 2.255 0.0241 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1534.7 on 1196 degrees of freedom
## Residual deviance: 1406.3 on 1171 degrees of freedom
## AIC: 1458.3
##
## Number of Fisher Scoring iterations: 4
We only keep significant variables thanks to the stepwise selection
# Backward stepwise selection; k = log(n) makes step() optimise the BIC
# criterion rather than AIC, keeping only the strongest predictors.
model.backward <- step(modeltraining, direction = "backward",
                       k = log(nrow(training)), trace = 0)
pred.backward <- predict(model.backward, newdata = validation, type = "response")
summary(model.backward)
##
## Call:
## glm(formula = churn_trx ~ fallbonus + Experience + Type3, family = binomial(link = "logit"),
## data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0147 -1.0446 0.7517 0.8456 1.6397
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.211e+00 1.246e-01 9.716 < 2e-16 ***
## fallbonus -1.389e+00 1.528e-01 -9.095 < 2e-16 ***
## Experience -1.975e-05 5.139e-06 -3.844 0.000121 ***
## Type3 7.673e-01 1.788e-01 4.292 1.77e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1534.7 on 1196 degrees of freedom
## Residual deviance: 1426.3 on 1193 degrees of freedom
## AIC: 1434.3
##
## Number of Fisher Scoring iterations: 4
# Predicted churn probabilities on the validation set.
pred <- predict(model.backward, type = "response", newdata = validation)
validation$prob_churn <- pred
# Classify with a 67% probability cutoff (close to the observed churn rate).
pred_churn <- factor(ifelse(pred >= 0.67, "Yes", "No"))
# Fix: `validation$churn` only worked through data.frame `$` partial matching
# (the column is actually `churn_trx`); reference the column explicitly so
# the code does not silently break if another "churn*" column is added.
actual_churn <- factor(ifelse(validation$churn_trx == 1, "Yes", "No"))
table(actual_churn, pred_churn)
## pred_churn
## actual_churn No Yes
## No 80 99
## Yes 91 244
# PCC: percentage (proportion) of validation cases correctly classified.
PCC <- mean(actual_churn == pred_churn)
PCC
## [1] 0.6303502
Let’s split the dataset to perform our logistic regression
# Same 70/30 split, now for the session-churn logistic regression.
set.seed(654)
split <- sample.split(basetable_churn_sess$CustomerID, SplitRatio = 0.70)
training <- basetable_churn_sess[split == TRUE, ]
validation <- basetable_churn_sess[split == FALSE, ]
# Full logistic model: churn_sess against every remaining column.
modeltraining <- glm(churn_sess ~ ., family = binomial(link = 'logit'), data = training)
summary(modeltraining)
##
## Call:
## glm(formula = churn_sess ~ ., family = binomial(link = "logit"),
## data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2182 -0.8128 -0.6336 1.2221 2.6356
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.931e+00 3.757e+00 -2.111 0.03479 *
## CustomerID 5.859e-06 2.845e-05 0.206 0.83685
## Recency -1.579e-03 1.634e-03 -0.966 0.33394
## Frequency 7.651e-02 1.440e-01 0.531 0.59507
## Recency_Transac -3.411e-03 1.774e-03 -1.923 0.05448 .
## Frequency_Transac -8.907e-01 3.780e-01 -2.356 0.01845 *
## Monetary 1.291e-02 7.881e-03 1.638 0.10145
## Registrationdate 4.294e-04 2.127e-04 2.018 0.04355 *
## SexWomen -5.999e-02 8.419e-02 -0.712 0.47618
## Age -4.236e-03 3.144e-03 -1.347 0.17791
## fallbonus -2.368e-01 1.058e-01 -2.237 0.02527 *
## Income -5.754e-02 5.032e-02 -1.143 0.25283
## Experience 1.486e-02 7.175e-02 0.207 0.83590
## Pokestops -7.630e-01 3.587e+00 -0.213 0.83157
## Gyms -2.975e+00 1.435e+01 -0.207 0.83578
## Raids -1.485e+01 7.174e+01 -0.207 0.83607
## Social -2.954e+00 1.435e+01 -0.206 0.83690
## Pokemons -7.346e-01 3.587e+00 -0.205 0.83775
## Distance -1.474e+00 7.174e+00 -0.205 0.83725
## Duration -3.359e-04 4.803e-04 -0.699 0.48436
## NbTrxProd1 6.997e-01 3.703e-01 1.889 0.05886 .
## NbTrxProd2 4.781e-01 3.620e-01 1.321 0.18661
## NbTrxProd3 5.341e-01 3.559e-01 1.501 0.13343
## NbTrxProd4 NA NA NA NA
## NbTrxProd5 NA NA NA NA
## clv NA NA NA NA
## clv_time NA NA NA NA
## Type1 5.842e-02 2.015e-01 0.290 0.77191
## Type2 5.258e-01 1.753e-01 2.999 0.00271 **
## Type3 -6.394e-01 2.466e-01 -2.593 0.00952 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3742.4 on 3300 degrees of freedom
## Residual deviance: 3553.5 on 3275 degrees of freedom
## AIC: 3605.5
##
## Number of Fisher Scoring iterations: 4
We only keep significant variables thanks to the stepwise selection
# Backward stepwise selection with a BIC penalty (k = log(n)).
model.backward <- step(modeltraining, direction = "backward",
                       k = log(nrow(training)), trace = 0)
pred.backward <- predict(model.backward, newdata = validation, type = "response")
summary(model.backward)
##
## Call:
## glm(formula = churn_sess ~ Frequency_Transac + Type2 + Type3,
## family = binomial(link = "logit"), data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0029 -0.8179 -0.6865 1.3625 2.5390
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.92324 0.05967 -15.473 < 2e-16 ***
## Frequency_Transac -0.40221 0.06726 -5.980 2.23e-09 ***
## Type2 0.49790 0.09272 5.370 7.88e-08 ***
## Type3 -0.65044 0.11972 -5.433 5.54e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3742.4 on 3300 degrees of freedom
## Residual deviance: 3598.5 on 3297 degrees of freedom
## AIC: 3606.5
##
## Number of Fisher Scoring iterations: 4
Let’s split the dataset to perform our logistic regression
# Same 70/30 split, now for the total-churn logistic regression.
set.seed(654)
split <- sample.split(basetable_churn_total$CustomerID, SplitRatio = 0.70)
training <- basetable_churn_total[split == TRUE, ]
validation <- basetable_churn_total[split == FALSE, ]
# Full logistic model: churn_total against every remaining column.
modeltraining <- glm(churn_total ~ ., family = binomial(link = 'logit'), data = training)
summary(modeltraining)
##
## Call:
## glm(formula = churn_total ~ ., family = binomial(link = "logit"),
## data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0945 -0.6843 -0.5245 -0.3346 2.6329
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.298e+01 7.227e+00 -1.797 0.0724 .
## CustomerID -7.309e-05 5.587e-05 -1.308 0.1908
## Recency 1.891e-03 3.171e-03 0.596 0.5510
## Frequency 2.635e-01 2.656e-01 0.992 0.3211
## Recency_Transac 1.453e-03 2.334e-03 0.622 0.5337
## Frequency_Transac -4.677e-02 3.732e-01 -0.125 0.9003
## Monetary 1.741e-03 7.851e-03 0.222 0.8245
## Registrationdate 6.762e-04 4.087e-04 1.655 0.0980 .
## SexWomen -6.845e-05 1.605e-01 0.000 0.9997
## Age -7.137e-04 5.977e-03 -0.119 0.9050
## fallbonus -1.140e+00 2.578e-01 -4.421 9.8e-06 ***
## Income -1.411e-02 9.490e-02 -0.149 0.8818
## Experience 6.512e-02 1.306e-01 0.498 0.6181
## Pokestops -3.280e+00 6.533e+00 -0.502 0.6156
## Gyms -1.304e+01 2.613e+01 -0.499 0.6176
## Raids -6.509e+01 1.306e+02 -0.498 0.6183
## Social -1.302e+01 2.613e+01 -0.498 0.6183
## Pokemons -3.246e+00 6.532e+00 -0.497 0.6192
## Distance -6.499e+00 1.306e+01 -0.497 0.6188
## Duration -1.107e-03 8.335e-04 -1.329 0.1840
## NbTrxProd1 1.962e-01 3.441e-01 0.570 0.5686
## NbTrxProd2 5.797e-02 3.367e-01 0.172 0.8633
## NbTrxProd3 6.322e-02 3.297e-01 0.192 0.8480
## NbTrxProd4 NA NA NA NA
## NbTrxProd5 NA NA NA NA
## clv NA NA NA NA
## clv_time NA NA NA NA
## Type1 3.291e-01 3.910e-01 0.842 0.4000
## Type2 4.867e-01 3.503e-01 1.389 0.1648
## Type3 -8.553e-02 4.597e-01 -0.186 0.8524
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1121.0 on 1196 degrees of freedom
## Residual deviance: 1050.6 on 1171 degrees of freedom
## AIC: 1102.6
##
## Number of Fisher Scoring iterations: 5
We only keep significant variables thanks to the stepwise selection
# Backward stepwise selection with a BIC penalty (k = log(n)).
model.backward <- step(modeltraining, direction = "backward",
                       k = log(nrow(training)), trace = 0)
pred.backward <- predict(model.backward, newdata = validation, type = "response")
summary(model.backward)
##
## Call:
## glm(formula = churn_total ~ fallbonus + Gyms + Type2, family = binomial(link = "logit"),
## data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9692 -0.6795 -0.5614 -0.3643 2.4906
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.000920 0.164362 -6.090 1.13e-09 ***
## fallbonus -1.142814 0.255289 -4.477 7.59e-06 ***
## Gyms -0.014471 0.003889 -3.721 0.000198 ***
## Type2 0.518296 0.180561 2.870 0.004099 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1121.0 on 1196 degrees of freedom
## Residual deviance: 1066.4 on 1193 degrees of freedom
## AIC: 1074.4
##
## Number of Fisher Scoring iterations: 5